PCA

Load Data

library("tidyverse")

Load data

Data will be fetched from hbiostat from vanderbilt department of biostatistics.

raw_dir <- "../data/_raw/"
data_file <- "prostate.rda"
data_loc <- "https://hbiostat.org/data/repo/"

if( !dir.exists(raw_dir) ){
  dir.create(path = raw_dir, recursive = TRUE)
}

if( !file.exists(str_c(raw_dir, data_file)) ){
  download.file(
    url = str_c(data_loc, data_file),
    destfile = str_c(raw_dir, data_file),
    mode = "wb")
}
load(file = str_c(raw_dir, data_file))

Metadata

data_dir <- "../data/"
meta_tsv_file <- "01_meta_dat_load.tsv"
if (!file.exists(str_c(data_dir, meta_tsv_file)) ) {
  prostate |>
    select(patno, stage, status, age, wt, pf, hx, sdate) |>
    write_tsv(file = str_c(data_dir, meta_tsv_file))
}

Treatment data

treatment_tsv_file <- "01_treatment_dat_load.tsv"
if (!file.exists(str_c(data_dir, treatment_tsv_file)) ) {
  prostate |>
  select(-stage, -status, -age, -wt, -pf, -hx, -sdate) |>
    write_tsv(file = str_c(data_dir, treatment_tsv_file))
}

Tidying Data

library("tidyverse")
set.seed(2025)

A quick look

First we’ll take a quick look at the dataset to see how we can tidy it

# Load the data saved in "01_load.qmd"
data_dir <- "../data/"
meta_df <- read_tsv(str_c(data_dir, "01_meta_dat_load.tsv"))
Rows: 502 Columns: 8
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr  (2): status, pf
dbl  (5): patno, stage, age, wt, hx
date (1): sdate

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
treatment_df <- read_tsv(str_c(data_dir, "01_treatment_dat_load.tsv"))
Rows: 502 Columns: 11
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (2): rx, ekg
dbl (9): patno, dtime, sbp, dbp, hg, sz, sg, ap, bm

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#Print random samples
meta_df |> slice_sample(n = 5) |> print()
# A tibble: 5 × 8
  patno stage status                     age    wt pf              hx sdate     
  <dbl> <dbl> <chr>                    <dbl> <dbl> <chr>        <dbl> <date>    
1   397     4 dead - prostatic ca         76    76 normal acti…     0 1978-08-10
2   460     4 dead - heart or vascular    75   124 normal acti…     0 1978-07-30
3   420     3 dead - heart or vascular    72   128 normal acti…     0 1978-09-12
4   410     4 alive                       71   104 normal acti…     0 1979-05-23
5   449     3 dead - heart or vascular    78   103 normal acti…     0 1978-08-07
treatment_df |> slice_sample(n = 5) |> print()
# A tibble: 5 × 11
  patno rx              dtime   sbp   dbp ekg       hg    sz    sg      ap    bm
  <dbl> <chr>           <dbl> <dbl> <dbl> <chr>  <dbl> <dbl> <dbl>   <dbl> <dbl>
1   279 5.0 mg estrogen    29    13     8 normal  12.7    17    13 1.00e+3     1
2   187 0.2 mg estrogen    67    16     6 heart…  13       4     9 4.00e-1     0
3   266 5.0 mg estrogen    53    13     7 rhyth…  13.4    17    11 2.78e+2     1
4   461 0.2 mg estrogen    40    10     6 heart…  14.7     2    12 8.20e+0     0
5   369 5.0 mg estrogen    52    16     8 old MI  10       1     7 6.00e-1     0

Joining the tables

Both tables contain information about a patient based on the patient ID in the patno column. We perform an inner join based on patno to have all variables in one dataframe. If there are observations with no associated ID it will be discarded however this should not be the case and will be checked later.

#Perform the inner join
full_df <- inner_join(x = meta_df,
                      y = treatment_df,
                      by = "patno")
full_df |> slice_sample(n = 5) |> print()
# A tibble: 5 × 18
  patno stage status    age    wt pf       hx sdate      rx    dtime   sbp   dbp
  <dbl> <dbl> <chr>   <dbl> <dbl> <chr> <dbl> <date>     <chr> <dbl> <dbl> <dbl>
1   460     4 dead -…    75   124 norm…     0 1978-07-30 1.0 …    30    15    10
2   388     4 dead -…    79    99 in b…     0 1978-03-29 0.2 …    11    16     8
3    27     4 alive      76    82 norm…     0 1977-11-09 plac…    69    12     8
4   472     3 dead -…    72   106 norm…     1 1978-12-12 0.2 …    33    13     8
5   379     4 alive      77    87 norm…     0 1977-05-31 plac…    75    11     6
# ℹ 6 more variables: ekg <chr>, hg <dbl>, sz <dbl>, sg <dbl>, ap <dbl>,
#   bm <dbl>

Checking for lost observations

If everything is as expected the the number of observations in full_df should match the number in meta_df and treatment_df and the number of variables should be the sum of varibles in the two minus one as patno exists in both.

cat("Size of meta_df:     ", dim(meta_df), "\n")
Size of meta_df:      502 8 
cat("Size of treatment_df:", dim(treatment_df), "\n")
Size of treatment_df: 502 11 
cat("Size of full_df:     ", dim(full_df))
Size of full_df:      502 18

As we can see, everything is as expected.

Splitting columns

As seen in the random sample of full_df the status column contains both dead/alive status and cause of death. We will split these to two columns.

full_df <- full_df |>
  separate(col = status,
           into = c("is_alive", "cause_of_death"),
           sep = " - ") |> 
  mutate(is_alive = factor(is_alive))
Warning: Expected 2 pieces. Missing pieces filled with `NA` in 148 rows [1, 5, 8, 9, 10,
11, 12, 13, 17, 20, 22, 23, 27, 41, 42, 44, 45, 49, 50, 57, ...].
full_df |> slice_sample(n = 20) |> print()
# A tibble: 20 × 19
   patno stage is_alive cause_of_death    age    wt pf       hx sdate      rx   
   <dbl> <dbl> <fct>    <chr>           <dbl> <dbl> <chr> <dbl> <date>     <chr>
 1   373     3 dead     other specific…    76   116 norm…     1 1979-05-17 0.2 …
 2   407     4 dead     prostatic ca       78    95 norm…     0 1978-12-06 plac…
 3    59     3 dead     cerebrovascular    78   118 norm…     1 1979-03-02 0.2 …
 4   395     4 dead     prostatic ca       81    98 norm…     1 1978-06-25 1.0 …
 5   159     4 alive    <NA>               78   116 norm…     0 1979-05-21 0.2 …
 6   142     3 dead     pulmonary embo…    83   115 norm…     0 1979-05-22 5.0 …
 7    37     4 dead     other specific…    72    93 norm…     0 1978-01-12 plac…
 8   288     4 dead     heart or vascu…    71   119 norm…     0 1978-05-22 plac…
 9    63     4 dead     prostatic ca       73   102 norm…     1 1977-10-27 0.2 …
10   285     4 dead     prostatic ca       59   100 norm…     0 1978-03-01 plac…
11   289     4 dead     prostatic ca       76    94 norm…     0 1978-10-18 0.2 …
12   252     3 dead     heart or vascu…    79    94 norm…     1 1979-06-04 5.0 …
13   345     3 dead     prostatic ca       70    83 norm…     0 1978-05-10 1.0 …
14   118     4 alive    <NA>               73   106 norm…     1 1979-05-08 1.0 …
15    88     3 dead     other ca           70    72 norm…     1 1977-12-13 1.0 …
16   195     4 dead     heart or vascu…    75    88 norm…     1 1977-09-18 plac…
17   174     4 dead     heart or vascu…    68   100 norm…     0 1978-07-10 5.0 …
18   428     4 dead     prostatic ca       56   100 norm…     0 1978-02-21 plac…
19   371     3 alive    <NA>               76    90 norm…     0 1979-05-01 5.0 …
20   306     3 dead     other specific…    62    88 norm…     1 1977-08-21 0.2 …
# ℹ 9 more variables: dtime <dbl>, sbp <dbl>, dbp <dbl>, ekg <chr>, hg <dbl>,
#   sz <dbl>, sg <dbl>, ap <dbl>, bm <dbl>

We will also split the treatment variable into dosage and treatment group

full_df <- full_df |>
  mutate("treatment" = case_when(str_detect(rx, "estrogen") ~ "estrogen",
                                 str_detect(rx, "placebo") ~ "placebo",
                                 TRUE ~ NA),
         dosage = as.numeric(str_extract(rx, "\\d+\\.\\d+")), #extract number dot number w regex
         dosage = as.numeric(replace_na(dosage, 0))
         ) |> 
  select(-rx)

Renaming variables

Some of the columns have somewhat non-descriptive names. For these new names will be assigned based on the information available at https://hbiostat.org/data/repo/cprostate.

full_df <- full_df |> rename(patient_no = "patno",
                             weight = "wt",
                             performance = "pf",
                             medical_history = "hx",
                             study_date = "sdate",
                             follow_up_months = "dtime",
                             sys_blood_pressure= "sbp",
                             dia_blood_pressure = "dbp",
                             serum_hemoglobin = "hg",
                             size_of_primary_tumor = "sz",
                             combined_index = "sg",
                             bone_metastases = "bm",
                             serum_prostatic_acid_phosphatase = "ap")

Data types

Finally we cast the stage, is_alive and performance variables to factors. medical_history and bone_metastases are cast to logical vectors.

full_df <- full_df |> mutate(stage = factor(as.integer(stage), levels = c(1,2,3,4), ordered = TRUE),
                             cause_of_death = factor(cause_of_death),
                             performance = factor(performance),
                             ekg = factor(ekg),
                             medical_history = medical_history == 1,
                             bone_metastases = bone_metastases == 1) # convert to logical

Overview

The table1 package is used to generate a quick overview to ensure everything is as it should be

table1::table1(full_df, x = formula(~stage + age + performance | is_alive + treatment))
alive
dead
Overall
estrogen
(N=116)
placebo
(N=32)
estrogen
(N=259)
placebo
(N=95)
estrogen
(N=375)
placebo
(N=127)
stage
1 0 (0%) 0 (0%) 0 (0%) 0 (0%) 0 (0%) 0 (0%)
2 0 (0%) 0 (0%) 0 (0%) 0 (0%) 0 (0%) 0 (0%)
3 73 (62.9%) 23 (71.9%) 142 (54.8%) 51 (53.7%) 215 (57.3%) 74 (58.3%)
4 43 (37.1%) 9 (28.1%) 117 (45.2%) 44 (46.3%) 160 (42.7%) 53 (41.7%)
age
Mean (SD) 69.8 (7.33) 71.5 (5.92) 72.0 (7.19) 71.8 (6.61) 71.4 (7.30) 71.7 (6.42)
Median [Min, Max] 72.0 [49.0, 84.0] 73.0 [55.0, 79.0] 73.0 [48.0, 89.0] 73.0 [50.0, 84.0] 73.0 [48.0, 89.0] 73.0 [50.0, 84.0]
Missing 1 (0.9%) 0 (0%) 0 (0%) 0 (0%) 1 (0.3%) 0 (0%)
performance
confined to bed 0 (0%) 0 (0%) 2 (0.8%) 0 (0%) 2 (0.5%) 0 (0%)
in bed < 50% daytime 2 (1.7%) 2 (6.3%) 26 (10.0%) 7 (7.4%) 28 (7.5%) 9 (7.1%)
in bed > 50% daytime 2 (1.7%) 0 (0%) 6 (2.3%) 5 (5.3%) 8 (2.1%) 5 (3.9%)
normal activity 112 (96.6%) 30 (93.8%) 225 (86.9%) 83 (87.4%) 337 (89.9%) 113 (89.0%)

Save the data to drive

#save a compressed tsv
data_dir <- "../data/"
tidy_tsv_file <- "02_dat_tidy.tsv.gz"

full_df |> write_tsv(file = str_c(data_dir, tidy_tsv_file))

Augment Data

Load library

library("tidyverse")
source("99_proj_func.R")

Load data

Loading the prepared tidy data.

data_dir <- "data/"
tidy_file <- "02_dat_tidy.tsv.gz"

full_df <- read_tsv("../data/02_dat_tidy.tsv.gz")
full_df |> slice_sample(n = 10)
# A tibble: 10 × 20
   patient_no stage is_alive cause_of_death        age weight performance       
        <dbl> <dbl> <chr>    <chr>               <dbl>  <dbl> <chr>             
 1         59     3 dead     cerebrovascular        78    118 normal activity   
 2        317     3 dead     cerebrovascular        75    145 normal activity   
 3         16     3 dead     heart or vascular      87     81 in bed < 50% dayt…
 4        451     3 dead     heart or vascular      72    101 normal activity   
 5        325     3 dead     respiratory disease    75     96 normal activity   
 6         52     3 dead     heart or vascular      76     98 normal activity   
 7        186     3 alive    <NA>                   67    109 normal activity   
 8        144     4 dead     heart or vascular      73    103 normal activity   
 9        370     3 alive    <NA>                   80     99 normal activity   
10         49     3 alive    <NA>                   49     99 normal activity   
# ℹ 13 more variables: medical_history <lgl>, study_date <date>,
#   follow_up_months <dbl>, sys_blood_pressure <dbl>, dia_blood_pressure <dbl>,
#   ekg <chr>, serum_hemoglobin <dbl>, size_of_primary_tumor <dbl>,
#   combined_index <dbl>, serum_prostatic_acid_phosphatase <dbl>,
#   bone_metastases <lgl>, treatment <chr>, dosage <dbl>

Filtering out extreme AP values

Its noted on the Vanderbilt repo that some entries have extremely high AP values. Unfortunantly the explanation is no longer available. For this reason we choose to remove these rows entirely:

full_df |>
  ggplot(aes(x=serum_prostatic_acid_phosphatase)) +
  geom_histogram(binwidth = 100) +
  labs(
    title = "Distribution of AP values"
  ) +
  best_theme()

Based on the plot we discard all entries with AP values above 250

cutoff <- 250
full_df <- full_df |> 
  filter(serum_prostatic_acid_phosphatase <= cutoff)
full_df |> slice_sample(n = 10)
# A tibble: 10 × 20
   patient_no stage is_alive cause_of_death       age weight performance        
        <dbl> <dbl> <chr>    <chr>              <dbl>  <dbl> <chr>              
 1        457     4 alive    <NA>                  77    106 normal activity    
 2        405     4 dead     unspecified non-ca    72     86 in bed < 50% dayti…
 3         51     3 dead     prostatic ca          73    101 normal activity    
 4        115     4 dead     cerebrovascular       78    103 in bed < 50% dayti…
 5        187     3 alive    <NA>                  74    103 normal activity    
 6        108     4 dead     other ca              76     92 normal activity    
 7        439     3 alive    <NA>                  71    105 normal activity    
 8        308     3 dead     prostatic ca          74     97 normal activity    
 9        458     4 alive    <NA>                  71    101 normal activity    
10         24     4 dead     prostatic ca          56     95 normal activity    
# ℹ 13 more variables: medical_history <lgl>, study_date <date>,
#   follow_up_months <dbl>, sys_blood_pressure <dbl>, dia_blood_pressure <dbl>,
#   ekg <chr>, serum_hemoglobin <dbl>, size_of_primary_tumor <dbl>,
#   combined_index <dbl>, serum_prostatic_acid_phosphatase <dbl>,
#   bone_metastases <lgl>, treatment <chr>, dosage <dbl>

Preparing data for modelling

Modelling tumor size vs other factors may pose an interesting investigation and see which factors may be important for tumor development in advanced stages. We need to first convert the columns that are not already numeric to a numeric representation. This can be done in a number of ways but to keep it similar we choose that the higher the number the worse for the potion. For example normal activity is assigned zero and low activity such as stuck in bed is assigned four.

df_num <- full_df |>
  mutate(is_alive = case_when(is_alive == "alive" ~ 0,
                              is_alive == "dead" ~ 1),
         performance = case_when(performance == "normal activity" ~ 0,
                                 performance == "in bed < 50% daytime" ~ 1,
                                 performance == "in bed > 50% daytime" ~ 2,
                                 performance == "confined to bed" ~ 4),
         medical_history = as.numeric(medical_history),
         ekg = case_when(ekg == "normal" ~ 0,
                         ekg == "benign" ~ 1,
                         ekg == "heart block or conduction def" ~ 2,
                         ekg == "rhythmic disturb & electrolyte ch" ~ 3,
                         ekg == "old MI" ~ 4,
                         ekg == "recent MI" ~ 5,
                         ekg == "rhythmic disturb & electrolyte ch" ~ 6,
                         ekg == "heart strain" ~ 7),
         bone_metastases = as.numeric(bone_metastases),
         treatment = case_when(treatment == "placebo" ~ 0,
                               treatment == "estrogen" ~ 1)
         )

Similarly to the example available at https://r4bds.github.io/lab06.html we need a long version of the data where current variable is stored as an entry.

df_long <- df_num |>
  select(c(patient_no,stage, age, weight, follow_up_months,
                        sys_blood_pressure, dia_blood_pressure,
                        serum_hemoglobin, serum_prostatic_acid_phosphatase,
                        combined_index, size_of_primary_tumor, is_alive, performance, medical_history,ekg, bone_metastases, treatment, dosage)) |> 
  pivot_longer(cols = c(stage, age, weight, follow_up_months,
                        sys_blood_pressure, dia_blood_pressure,
                        serum_hemoglobin, serum_prostatic_acid_phosphatase,
                        combined_index, is_alive, performance, medical_history,ekg, bone_metastases, treatment, dosage),
               names_to = "variable",
               values_to = "value")

print(df_long |> slice_sample(n = 10))
# A tibble: 10 × 4
   patient_no size_of_primary_tumor variable           value
        <dbl>                 <dbl> <chr>              <dbl>
 1        376                    36 follow_up_months     2  
 2         63                    51 is_alive             1  
 3        419                    14 ekg                  7  
 4        114                     1 performance          0  
 5        490                     5 serum_hemoglobin    13.7
 6        378                    26 weight             106  
 7        385                     0 combined_index      13  
 8        424                    16 performance          0  
 9         74                    NA dosage               0.2
10        404                    46 dia_blood_pressure   8  

For analysis nesting variables is required.

df_long_nested <- df_long |>
  group_by(variable) |>
  nest()
df_long_nested |> head(10) |> print() #quick check
# A tibble: 10 × 2
# Groups:   variable [10]
   variable                         data              
   <chr>                            <list>            
 1 stage                            <tibble [496 × 3]>
 2 age                              <tibble [496 × 3]>
 3 weight                           <tibble [496 × 3]>
 4 follow_up_months                 <tibble [496 × 3]>
 5 sys_blood_pressure               <tibble [496 × 3]>
 6 dia_blood_pressure               <tibble [496 × 3]>
 7 serum_hemoglobin                 <tibble [496 × 3]>
 8 serum_prostatic_acid_phosphatase <tibble [496 × 3]>
 9 combined_index                   <tibble [496 × 3]>
10 is_alive                         <tibble [496 × 3]>

Save this data

This data is saved as .Rdata to ensure that types are carried over

save(df_long_nested, df_num, file = "../data/04_data_for_modelling.Rdata")

Augment data for treatment class

On top of nesting variables, a data object with attributed treatment class is made. The MAP (Mean Arterial Pressure) variable is added to this set. MAP is the average blood pressure in a person’s arteries during one cardiac cycle (one heartbeat).

augment_data <- full_df |>
  mutate(MAP = (sys_blood_pressure+ 2 *dia_blood_pressure) / 3,
         is_alive = case_when(is_alive == "alive" ~ 0,
                              is_alive == "dead" ~ 1),
         treatment_detail = case_when(
           treatment == "placebo" ~ "Placebo",
           treatment == "estrogen" & dosage == 0.2 ~ "0.2mg Estrogen",
           treatment == "estrogen" & dosage == 1.0 ~ "1.0mg Estrogen",
           treatment == "estrogen" & dosage == 5.0 ~ "5.0mg Estrogen",
           TRUE ~ NA_character_),
         treatment_detail = factor(treatment_detail, 
                                   levels = c("Placebo", 
                                              "0.2mg Estrogen", 
                                              "1.0mg Estrogen", 
                                              "5.0mg Estrogen")),)

augment_data |> slice_sample(n = 10)
# A tibble: 10 × 22
   patient_no stage is_alive cause_of_death      age weight performance    
        <dbl> <dbl>    <dbl> <chr>             <dbl>  <dbl> <chr>          
 1        310     3        1 cerebrovascular      74     95 normal activity
 2        344     3        1 heart or vascular    76    102 normal activity
 3        407     4        1 prostatic ca         78     95 normal activity
 4         82     3        1 heart or vascular    79    106 normal activity
 5        235     4        0 <NA>                 53    116 normal activity
 6        155     4        1 prostatic ca         67    111 normal activity
 7         39     4        1 prostatic ca         53     99 normal activity
 8        446     3        0 <NA>                 63     95 normal activity
 9        245     3        1 other ca             75     81 normal activity
10        438     3        0 <NA>                 73     84 normal activity
# ℹ 15 more variables: medical_history <lgl>, study_date <date>,
#   follow_up_months <dbl>, sys_blood_pressure <dbl>, dia_blood_pressure <dbl>,
#   ekg <chr>, serum_hemoglobin <dbl>, size_of_primary_tumor <dbl>,
#   combined_index <dbl>, serum_prostatic_acid_phosphatase <dbl>,
#   bone_metastases <lgl>, treatment <chr>, dosage <dbl>, MAP <dbl>,
#   treatment_detail <fct>

Saving treatment class data

data_dir <- "../data/"
aug_tsv_file <- "03_dat_aug.tsv.gz"
augment_data |> write_tsv(file = str_c(data_dir,aug_tsv_file))

Description

Load libraries

library("tidyverse")
library("viridis")
library("ggridges")
source("99_proj_func.R")

Data description

This dataset is from Green and Bryar who have analysed data from a randomized clinical trial of prostate cancer paitients in stage 3 and 4. In this project the same data will be analysed

Meta data

Below is the full prostate data set after tidying.

prostate_tidy <- read_tsv("../data/02_dat_tidy.tsv.gz")
prostate_tidy
# A tibble: 502 × 20
   patient_no stage is_alive cause_of_death      age weight performance         
        <dbl> <dbl> <chr>    <chr>             <dbl>  <dbl> <chr>               
 1          1     3 alive    <NA>                 75     76 normal activity     
 2          2     3 dead     other ca             54    116 normal activity     
 3          3     3 dead     cerebrovascular      69    102 normal activity     
 4          4     3 dead     cerebrovascular      75     94 in bed < 50% daytime
 5          5     3 alive    <NA>                 67     99 normal activity     
 6          6     3 dead     prostatic ca         71     98 normal activity     
 7          7     3 dead     heart or vascular    75    100 normal activity     
 8          8     3 alive    <NA>                 73    114 normal activity     
 9          9     3 alive    <NA>                 60    110 normal activity     
10         10     3 alive    <NA>                 78    107 normal activity     
# ℹ 492 more rows
# ℹ 13 more variables: medical_history <lgl>, study_date <date>,
#   follow_up_months <dbl>, sys_blood_pressure <dbl>, dia_blood_pressure <dbl>,
#   ekg <chr>, serum_hemoglobin <dbl>, size_of_primary_tumor <dbl>,
#   combined_index <dbl>, serum_prostatic_acid_phosphatase <dbl>,
#   bone_metastases <lgl>, treatment <chr>, dosage <dbl>

It contains data from 502 patients with each 20 variables that have been kept track of. The most important in this instance is; stage, combined stage index, cause of death, treatment, dosage and clinical measurements. These include: weight, age, medical history, systolic/diastolic blood pressure, ekg, serum hemoglobin, primary tumor size in cm2, serum prostatic acid phosphatase (PAP) and bone metastasis indicator. Distribution of these various parameters can be seen below.

Stage distribution

(prostate_tidy |>
  mutate(stage = factor(stage),
         is_alive = factor(is_alive)) |>
  ggplot(aes(
    x = stage,
    fill = is_alive)) +
  geom_bar(
    color = 'black',
    position = "dodge",
    alpha = 0.4) +
  geom_hline(yintercept = 0) +
  theme_minimal() +
  theme(legend.position = "right",
        axis.text.x = element_text(angle = 10, hjust = 1, size = 10, vjust = 1.5), 
        axis.text.y = element_text(size = 6)) + 
  labs(x = "Stage", y = "Count", fill = "Status")) |> 
save_and_show("status_comparison_hist")
Saving 7 x 5 in image

prostate_tidy |>
  group_by(stage, is_alive) |>
  summarize(n = n())
# A tibble: 4 × 3
# Groups:   stage [2]
  stage is_alive     n
  <dbl> <chr>    <int>
1     3 alive       96
2     3 dead       193
3     4 alive       52
4     4 dead       161

As mentioned the dataset consists of 502 patients, as seen above. 96 of were in stage 3 and alive while 193 of had died. For stage for 4, 52 were alive and 161 patients had died.

full_df <- prostate_tidy |> mutate(stage = factor(stage, levels = c(3,4)))

full_df |>
  ggplot(aes(x = stage, y = size_of_primary_tumor)) +
  geom_boxplot(outlier.shape = NA) +      # hide the outliners
  geom_jitter(width = 0.15, alpha = 0.6) + 
  coord_cartesian(ylim = c(0, 40)) +      # focus on the main part
  labs(
    x = "Clinical stage",
    y = "Size of primary tumor",
    title = "Tumor size across clinical stages"
  ) +
  theme_minimal()+
  theme(plot.title = element_text(hjust = 0.5) # keep titles in the middle
  )

A monotonic increase in tumor size across stages suggests that tumor size is a useful measure of disease progression and may influence survival risk.

Treatment groups

(prostate_tidy |>
  mutate(stage = factor(stage),
         dosage = factor(dosage)) |>
  ggplot(aes(x = stage,
         fill = dosage)) +
  geom_bar(
    color = 'black',
    position = "dodge",
    alpha = 0.4) +
  geom_hline(yintercept = 0) +
  scale_fill_viridis_d(
    option = "D",
    labels = function(x) paste0(x, " mg")
    ) +
  theme_minimal() +
  theme(legend.position = "right",
        axis.text.x = element_text(angle = 10, hjust = 1, size = 10, vjust = 1.5), 
        axis.text.y = element_text(size = 6)) + 
  labs(x = "Stage", y = "Count", fill = "Dosage")) |> 
save_and_show("dosage_dist_by_stage")
Saving 7 x 5 in image

Tumor size distribution

We can see there the treatment groups are very evenly distributed.

prostate_tidy |>
  mutate(is_alive = factor(is_alive),
         stage = factor(stage)) |>
  ggplot(aes(x = size_of_primary_tumor, y = is_alive, fill = is_alive)) +
  geom_density_ridges(alpha = 0.6, color = "black") +
  facet_wrap(~stage) +
  scale_fill_manual(
    values = c("alive" = "#FCFDBFFF", "dead" = "#000004FF")) +
  labs(title = "Tumor size stratified by status and stage", 
       x = bquote('Tumor size in' ~ cm^2), y = "Status") +
  theme_minimal(base_family = "lato") +
  theme(legend.position = "none")
Picking joint bandwidth of 2.61
Picking joint bandwidth of 4.39
Warning: Removed 5 rows containing non-finite outside the scale range
(`stat_density_ridges()`).

Causes of death

Most importantly we also want to investigate the most prevelant causes of death.

prostate_tidy |>
  mutate(cause_of_death = factor(cause_of_death)) |>
  group_by(cause_of_death) |>
  filter(!is.na(cause_of_death)) |> # for alive
  summarize(n = n())
# A tibble: 9 × 2
  cause_of_death            n
  <fct>                 <int>
1 cerebrovascular          31
2 heart or vascular        96
3 other ca                 25
4 other specific non-ca    28
5 prostatic ca            130
6 pulmonary embolus        14
7 respiratory disease      16
8 unknown cause             7
9 unspecified non-ca        7

As expected the leading cause of death of individuals in the study were prostatic cancer or other types, with heart and vascular also have a prominent representation.

Density of measurements: AP and Serum hemoglobin

(prostate_tidy |>  
  pivot_longer(
    cols = c(serum_prostatic_acid_phosphatase, serum_hemoglobin),
    names_to = "variable",
    values_to = "value") |>
  ggplot(aes(x = value, fill = variable)) +
  geom_density(alpha = 0.6) + 
  coord_cartesian(xlim = c(0, 20), ylim = c(0, 0.5)) +
  scale_fill_manual(
    values = c(
      "serum_prostatic_acid_phosphatase" = "salmon",
      "serum_hemoglobin" = "skyblue"),
    labels = c(
      "serum_prostatic_acid_phosphatase" = "PAP",
      "serum_hemoglobin" = "Serum Hemoglobin")) +
  labs(x = "Value", fill = "Variable") +
  best_theme()) |> 
save_and_show("distribution_of_sh_and_pap")
Saving 7 x 5 in image

We see the vast majority of the variables are within the bottom range of respectively 0-5 for AP and 8-20 for Serum hemoglobin. Worth noting is that present in the data set are several outliers who show very high measurements of AP, up to 900+.

Analysis

library("tidyverse")
library("readr")
library("broom")
source("99_proj_func.R") # custom functions and theme

Load data

We load the data that was augmented for modelling:

load("../data/04_data_for_modelling.Rdata")

Modelling tumor size as a function of variables

Firstly a preliminary analysis of linear model of variables against tumor size as an indicator of disease progression is done.

df_long_nested_model <- df_long_nested |>
  mutate(model_object = map(data,
                            ~lm(size_of_primary_tumor ~ value,
                                data = .x))) |> 
  mutate(tidy_model = map(model_object, ~tidy(.x,
                                              conf.int = TRUE,
                                              conf.level = 0.95))) |> 
  unnest(cols = tidy_model) |> 
  filter(term == "value") |>  #discard intercept terms
  mutate(p_adjusted = p.adjust(p.value)) |> # correct for multiple testing
  arrange(p_adjusted) #sort pvalues to reveal highest significance

df_long_nested_model |> 
  select(-data, -model_object, -term, -std.error, -statistic) |>  
  print()
# A tibble: 16 × 6
# Groups:   variable [16]
   variable                      estimate  p.value conf.low conf.high p_adjusted
   <chr>                            <dbl>    <dbl>    <dbl>     <dbl>      <dbl>
 1 combined_index                 2.32    7.08e-18   1.81      2.83     7.08e-18
 2 stage                          6.71    1.67e- 9   4.57      8.86     1.67e- 9
 3 bone_metastases                8.97    2.90e- 9   6.06     11.9      2.90e- 9
 4 serum_prostatic_acid_phospha…  0.116   2.63e- 5   0.0623    0.170    2.63e- 5
 5 follow_up_months              -0.0955  5.96e- 5  -0.142    -0.0492   5.96e- 5
 6 is_alive                       4.43    2.95e- 4   2.04      6.81     2.95e- 4
 7 medical_history               -2.66    1.84e- 2  -4.87     -0.450    1.84e- 2
 8 serum_hemoglobin              -0.663   2.03e- 2  -1.22     -0.103    2.03e- 2
 9 performance                    2.68    4.24e- 2   0.0922    5.28     4.24e- 2
10 dia_blood_pressure            -0.467   2.24e- 1  -1.22      0.286    2.24e- 1
11 weight                        -0.0472  2.63e- 1  -0.130     0.0355   2.63e- 1
12 sys_blood_pressure             0.222   3.37e- 1  -0.231     0.675    3.37e- 1
13 ekg                            0.158   4.20e- 1  -0.226     0.542    4.20e- 1
14 dosage                        -0.137   6.21e- 1  -0.679     0.406    6.21e- 1
15 treatment                     -0.161   9.00e- 1  -2.68      2.35     9.00e- 1
16 age                            0.00851 9.15e- 1  -0.148     0.165    9.15e- 1

Based on these models it appears that most variables have an independent correlation with tumor size. This will be illustrated below.

Volcano plot

A volcano plot shows our estimate slope vs the p-value.

(df_long_nested_model |> 
  ggplot(mapping = aes(x = estimate,
                     y = -log10(p_adjusted),
                     color = factor(p_adjusted < 0.05))) +
  geom_point(alpha = 0.9) +
  geom_hline(yintercept = 0)+
  geom_vline(xintercept = 0)+
  best_theme() +
  theme(legend.position = "none") +
  ggrepel::geom_text_repel(aes(label = case_when(p_adjusted < 0.05 ~ variable,
                                                 TRUE ~ ""))) +
  labs(
    title = "Estimate vs -log10 of adjusted p-value",
    y = "-log10 of p-values adjusted for multiple testing",
    x = "estimate")) |>
    save_and_show("linear_volcano")
Saving 7 x 5 in image

Forest plot

The forest plot shows the confidence interval of the modeled slope for all variables

(forest_plot <- df_long_nested_model |> 
  filter(p_adjusted < 0.05) |> 
  ggplot(mapping = aes(x = estimate,
                       y = fct_reorder(variable, estimate),#sort by the slope 
                       xmin = conf.low,
                       xmax = conf.high)) +
  geom_errorbar(orientation = "y") +
  geom_point() +
  geom_vline(xintercept = 0) +
  theme_minimal(base_size = 12) +
  labs(
    x = "Estimates (95% CIs)",
    y = "Variable",
    title = "Variables with significant impact on tumor size"
  ) +
  best_theme()) |>
  save_and_show("linear_forest_plot")
Saving 7 x 5 in image

Residual QQ-plots

Here we seek to validate if the linear models make sense of if there is a depature from expected normality.

df_resids <- df_long_nested_model |> 
  mutate(aug = map(model_object, broom::augment)) |> 
  unnest(aug)
(ggplot(df_resids, aes(sample = .resid)) +
  stat_qq() +
  stat_qq_line() +
  facet_wrap(~ variable, scales = "free") +
  theme_minimal()) |>
  save_and_show("linear_qq")
Saving 7 x 5 in image

Evident from the QQ-plots are a distinct departure from the normal-distribution in tails. This is present in each model and therefore it is likely that tumor size has outliers. This is investigated following by a histogram and QQ-plot.

(df_num |> 
  ggplot() +
  geom_histogram(aes(x = size_of_primary_tumor), 
                 color = "black", alpha = 0.8, fill = "grey") +
  best_theme()) |> 
save_and_show("tumor_histogram")
Saving 7 x 5 in image
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Warning: Removed 5 rows containing non-finite outside the scale range
(`stat_bin()`).
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Warning: Removed 5 rows containing non-finite outside the scale range
(`stat_bin()`).

(df_num |>  
  ggplot(aes(sample = size_of_primary_tumor)) +
  stat_qq() +
  stat_qq_line(color = "salmon") +
  best_theme()) |>
save_and_show("tumor_qq")
Saving 7 x 5 in image
Warning: Removed 5 rows containing non-finite outside the scale range
(`stat_qq()`).
Warning: Removed 5 rows containing non-finite outside the scale range
(`stat_qq_line()`).
Warning: Removed 5 rows containing non-finite outside the scale range
(`stat_qq()`).
Warning: Removed 5 rows containing non-finite outside the scale range
(`stat_qq_line()`).

These two plots confirm a definite departure from normality, indicative that log-transformed data would make a better fit.

(df_num |>
  filter(size_of_primary_tumor > 0) |>
  ggplot(aes(sample = log(size_of_primary_tumor))) +
  stat_qq() +
  stat_qq_line(color = "salmon") +
  best_theme()) |> 
save_and_show("log_tumor_qq")
Saving 7 x 5 in image

Evident from the model is a much closer normal distribution even if an S-shape is present this is likely due to a few outliers, but overall it reveals a logarithmic scale may suit the data better.

Modelling based on log transformed data

As just confirmed it is to be investigated wether a log-transformation of our data improves correlations. This requires dropping patients with tumor size zero.

df_long_log <- df_long_nested |> 
  unnest(cols = c(data)) |>
  filter(size_of_primary_tumor > 0)

df_long_log |> slice_sample(n = 10)
# A tibble: 160 × 4
# Groups:   variable [16]
   variable patient_no size_of_primary_tumor value
   <chr>         <dbl>                 <dbl> <dbl>
 1 age             349                     8    70
 2 age             376                    36    77
 3 age              42                    35    NA
 4 age             189                     4    73
 5 age             293                    12    74
 6 age              90                    13    76
 7 age             107                    14    70
 8 age              15                    15    73
 9 age             321                    18    75
10 age              12                     6    74
# ℹ 150 more rows

Nesting the model objects for cleanliness.

df_long_log_nest <- df_long_log |>
  group_by(variable) |>
  nest()

Model of logarithmic tumor size:

df_long_nested_model_log <- df_long_log_nest |>
  mutate(model_object = map(data,
                            ~lm(log(size_of_primary_tumor) ~ value,
                                data = .x))) |> 
  mutate(tidy_model = map(model_object, ~tidy(.x,
                                              conf.int = TRUE,
                                              conf.level = 0.95))) |> 
  unnest(cols = tidy_model) |> 
  filter(term == "value") |>  #discard intercept terms
  mutate(p_adjusted = p.adjust(p.value)) |> # correct for multiple testing
  arrange(p_adjusted) #sort pvalues to reveal highest significance

df_long_nested_model_log |> 
  select(-data, -model_object, -term, -std.error, -statistic) |>  
  print()
# A tibble: 16 × 6
# Groups:   variable [16]
   variable                      estimate  p.value conf.low conf.high p_adjusted
   <chr>                            <dbl>    <dbl>    <dbl>     <dbl>      <dbl>
 1 combined_index                 1.58e-1 6.12e-14  0.118     0.198     6.12e-14
 2 stage                          4.69e-1 4.28e- 8  0.303     0.634     4.28e- 8
 3 bone_metastases                6.16e-1 8.95e- 8  0.393     0.839     8.95e- 8
 4 serum_prostatic_acid_phospha…  8.46e-3 5.56e- 5  0.00437   0.0126    5.56e- 5
 5 is_alive                       2.95e-1 1.69e- 3  0.111     0.478     1.69e- 3
 6 follow_up_months              -5.63e-3 2.10e- 3 -0.00921  -0.00205   2.10e- 3
 7 serum_hemoglobin              -5.41e-2 1.34e- 2 -0.0969   -0.0113    1.34e- 2
 8 medical_history               -2.11e-1 1.44e- 2 -0.380    -0.0423    1.44e- 2
 9 sys_blood_pressure             1.93e-2 2.73e- 1 -0.0153    0.0540    2.73e- 1
10 dosage                        -2.27e-2 2.85e- 1 -0.0643    0.0190    2.85e- 1
11 dia_blood_pressure            -2.73e-2 3.50e- 1 -0.0848    0.0301    3.50e- 1
12 ekg                            1.15e-2 4.45e- 1 -0.0180    0.0409    4.45e- 1
13 performance                    7.31e-2 4.68e- 1 -0.125     0.271     4.68e- 1
14 treatment                     -3.26e-2 7.40e- 1 -0.225     0.160     7.40e- 1
15 age                           -1.40e-3 8.18e- 1 -0.0133    0.0105    8.18e- 1
16 weight                         1.01e-4 9.75e- 1 -0.00624   0.00644   9.75e- 1

If we compare these p-values with earlier linear model we see a much more significant correlation between these and log-transformed tumor size. This will be validated with residual QQ-plots.

Preparing for residual plots

df_resids_log <- df_long_nested_model_log |> 
  mutate(aug = map(model_object, 
                   broom::augment)) |> 
  unnest(aug)
(ggplot(df_resids_log, aes(sample = .resid)) +
  stat_qq() +
  stat_qq_line() +
  facet_wrap(~ variable, scales = "free") +
  theme_minimal()) |> 
save_and_show("log_qq")
Saving 7 x 5 in image

Aligning with our log-transformed tumor data the variables now also show a smaller from expected expectation normality.

Volcano plot

A volcano plot shows our estimate slope vs the p-value.

(
df_long_nested_model_log |> 
  ggplot(mapping = aes(x = estimate,
                     y = -log10(p_adjusted),
                     color = factor(p_adjusted < 0.05))) +
  geom_point(alpha = 0.9) +
  geom_hline(yintercept = 0)+
  geom_vline(xintercept = 0)+
  best_theme() +
  theme(legend.position = "none") +
  ggrepel::geom_text_repel(aes(label = case_when(p_adjusted < 0.05 ~ variable,
                                                 TRUE ~ ""))) +
  labs(
    title = "Estimate vs -log10 of adjusted p-value",
    y = "-log10 of p-values adjusted for multiple testing",
    x = "estimate")) |> 
save_and_show(plot_name = "log_volcano")
Saving 7 x 5 in image

Forest plot

The forest plot shows the confidence interval of the modelled slope for all variables

(df_long_nested_model_log |> 
  filter(p_adjusted < 0.05) |> 
  ggplot(mapping = aes(x = estimate,
                       y = fct_reorder(variable, estimate),#sort byr the skope
                       xmin = conf.low,
                       xmax = conf.high)) +
  geom_errorbar(orientation = "y") +
  geom_point() +
  geom_vline(xintercept = 0) +
  theme_minimal(base_size = 12) +
  labs(
    x = "Estimates (95% CIs)",
    y = "Variable",
    title = "Variables with significant impact on tumor size"
  ) +
  best_theme()) |> 
save_and_show("log_forest")
Saving 7 x 5 in image

Correlation among continuous clinical variables

Beyond comparison with tumor size a pairwise correlation for the quantitative biological and physiological measurements is computed to identify potential collinearity before survival analysis.

# Select relevant numeric variables
corr_df <- df_num |>
  select(
    age,
    weight,
    serum_hemoglobin,
    serum_prostatic_acid_phosphatase,
    size_of_primary_tumor)

# Compute raw correlation matrix
corr_mat <- cor(corr_df, use = "pairwise.complete.obs")

Correlation heatmap

# Convert matrix to long format for ggplot2
corr_long <- as.data.frame(corr_mat) |>
  rownames_to_column("Var1") |>
  pivot_longer(
    cols = -Var1,
    names_to = "Var2",
    values_to = "cor"
  )

corr_long |>
  ggplot(aes(x = Var2, y = Var1, fill = cor)) +
  geom_tile(color = "white") +
  geom_text(aes(label = sprintf("%.2f", cor)), size = 3) +
  scale_fill_gradient2(
    low = "blue",
    mid = "white",
    high = "red",
    midpoint = 0,
    limits = c(-1, 1),
    name = "Correlation"
  ) +
  theme_minimal(base_size = 12) +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1), # retate words
    panel.grid = element_blank() # remove grid lines
  ) +
  labs(
    title = "Correlation Matrix of Selected Variables",
    x = NULL,
    y = NULL
  )

This correlation matrix helps to identify redundant predictors. All the relevances is below 0.8, which indicates the selected variables can be seen as independent and used in a survival analysis. Given previous analysis it can also be concluded that shown variables

Exploratory multivariate analysis

Expanding on colliniarity and variables that have been shown significant in relation to log(size_of_primary_tumor) a few multivariate models are explored.

model_tidy <- lm(log(size_of_primary_tumor) ~ 
                   serum_hemoglobin + 
                   medical_history + 
                   bone_metastases + 
                   serum_prostatic_acid_phosphatase,
                 data = df_num |>
                   filter(size_of_primary_tumor > 0)) |> 
  tidy()
print(model_tidy)
# A tibble: 5 × 5
  term                             estimate std.error statistic  p.value
  <chr>                               <dbl>     <dbl>     <dbl>    <dbl>
1 (Intercept)                       2.57      0.310       8.30  1.09e-15
2 serum_hemoglobin                 -0.0206    0.0221     -0.930 3.53e- 1
3 medical_history                  -0.187     0.0837     -2.24  2.57e- 2
4 bone_metastases                   0.463     0.126       3.68  2.62e- 4
5 serum_prostatic_acid_phosphatase  0.00473   0.00225     2.10  3.61e- 2

Logistic regression predicting survivability (bad results)

A binary logistic regression is perfomed on the is_alive column. This is of limited biological relevanc but may reveal an interesting insight. interesting cols: select(is_alive, age, weight, size_of_primary_tumor,serum_prostatic_acid_phosphatase, performance)

# Turn df is_alive column into numeric variable for modelling:
df_a_ts <- df_num |>
  select(is_alive, size_of_primary_tumor) |> 
  drop_na()


df_a_ts |>
  ggplot(aes(y = is_alive,
             x = size_of_primary_tumor)) +
  geom_point() +
  labs(
    x = "Size of primary tumor",
    y = "Alive at time of study (1: YES, 0: NO)",
    title = "Being alive vs tumor size"
  ) +
  best_theme()

There seems to be a threshold for tumor size vs being alive at the time of study.

Fitting the logistic regression

my_glm_mdl <- glm(formula = is_alive ~ size_of_primary_tumor,
                  family = binomial(link = "logit"),
                  data = df_num)
my_glm_mdl

Call:  glm(formula = is_alive ~ size_of_primary_tumor, family = binomial(link = "logit"), 
    data = df_num)

Coefficients:
          (Intercept)  size_of_primary_tumor  
               0.4333                 0.0339  

Degrees of Freedom: 490 Total (i.e. Null);  489 Residual
  (5 observations deleted due to missingness)
Null Deviance:      592.4 
Residual Deviance: 578.1    AIC: 582.1
df_a_ts |> 
  mutate(my_glm_model = pluck(my_glm_mdl, "fitted.values")) |> 
  ggplot(aes(x = size_of_primary_tumor, y = is_alive)) +
  geom_point(alpha = 0.5,
             size = 3) +
  geom_line(aes(y = my_glm_model)) +
  labs(
    x = "Size of primary tumor",
    y = "Alive at time of study (0: YES, 1: NO)",
    title = "Being alive vs tumor size"
  ) +
  best_theme()

As mentioned there seems to be a certain correlation between these two but it is not a very telling analysis so it should not be considered.

Multivariate on measurements – No significant conclusions

Mentioned preivously this is an exploration of blood and treatment data to evaluate whether it could have an explanatory effect. This is conducted due to QQ-plots showing some variation and S-tailing. First explored is the tumor size, and potential effects of treatment, AP and hemoglobin.

ap_ts_log_tidy_model <- lm(
  log(size_of_primary_tumor) ~ 
    log(serum_prostatic_acid_phosphatase) +
    serum_prostatic_acid_phosphatase +
    serum_hemoglobin + 
    dosage + 
    bone_metastases +
    stage,
  data = df_num |> 
    filter(size_of_primary_tumor > 0))

print(ap_ts_log_tidy_model |>
        tidy())
# A tibble: 7 × 5
  term                                  estimate std.error statistic     p.value
  <chr>                                    <dbl>     <dbl>     <dbl>       <dbl>
1 (Intercept)                            3.06      0.555       5.50      6.05e-8
2 log(serum_prostatic_acid_phosphatase)  0.242     0.0543      4.46      1.01e-5
3 serum_prostatic_acid_phosphatase      -0.00546   0.00298    -1.83      6.76e-2
4 serum_hemoglobin                      -0.0268    0.0217     -1.24      2.17e-1
5 dosage                                -0.0297    0.0200     -1.48      1.39e-1
6 bone_metastases                        0.261     0.136       1.92      5.51e-2
7 stage                                 -0.118     0.132      -0.892     3.73e-1

Disregarding non-significant values:

ap_ts_log_tidy_model |> 
  tidy() |>
  mutate(padj = p.adjust(p.value, method = "fdr")) |>
  filter(padj < 0.05) |>
  save_and_show_table("log_tumor_sig_fit")
# A tibble: 2 × 6
  term                              estimate std.error statistic p.value    padj
  <chr>                                <dbl>     <dbl>     <dbl>   <dbl>   <dbl>
1 (Intercept)                          3.06     0.555       5.50 6.05e-8 4.23e-7
2 log(serum_prostatic_acid_phospha…    0.242    0.0543      4.46 1.01e-5 3.55e-5

Seems that log(AP) does have a significant correlation with tumor size but not more than the log tumor size.

Exploratory analysis of effects on prostatic acid phosphatase

ap_es_log_tidy_model <- lm(
  serum_prostatic_acid_phosphatase ~ 
    dosage +
    I(dosage^2) +
    I(dosage^2)*serum_hemoglobin +
    bone_metastases*dosage +
    log(size_of_primary_tumor),
  data = df_num |> filter(size_of_primary_tumor > 0))

print(ap_es_log_tidy_model |> tidy())
# A tibble: 8 × 5
  term                         estimate std.error statistic       p.value
  <chr>                           <dbl>     <dbl>     <dbl>         <dbl>
1 (Intercept)                    9.16      7.55      1.21   0.226        
2 dosage                         0.259     2.77      0.0933 0.926        
3 I(dosage^2)                    1.30      0.802     1.62   0.106        
4 serum_hemoglobin              -0.794     0.516    -1.54   0.124        
5 bone_metastases               19.6       3.18      6.16   0.00000000156
6 log(size_of_primary_tumor)     2.00      0.918     2.17   0.0302       
7 I(dosage^2):serum_hemoglobin  -0.0951    0.0430   -2.21   0.0274       
8 dosage:bone_metastases        -0.341     1.15     -0.298  0.766        

Again filtering for non-significant contributors.

ap_es_log_tidy_model |> 
  tidy() |>
  mutate(padj = p.adjust(p.value, method = "fdr")) |>
  filter(padj < 0.05) |>
  print()
# A tibble: 1 × 6
  term            estimate std.error statistic       p.value         padj
  <chr>              <dbl>     <dbl>     <dbl>         <dbl>        <dbl>
1 bone_metastases     19.6      3.18      6.16 0.00000000156 0.0000000125

We see the primary contributors to the AP levels to be bone metastasis, a relevant conclusion as this is prevalent in advanced diagnosis.

Exploratory analysis of effects on bone metastasis

We wanted to investigate the correlation between bone metastatis and blood factors.

bm_ap_hb_model <- lm(
  bone_metastases ~
    serum_prostatic_acid_phosphatase*I(dosage**2) +
    serum_prostatic_acid_phosphatase*serum_hemoglobin +
    dosage*serum_hemoglobin,
  data = df_num)

print(bm_ap_hb_model |> tidy())
# A tibble: 8 × 5
  term                                      estimate std.error statistic p.value
  <chr>                                        <dbl>     <dbl>     <dbl>   <dbl>
1 (Intercept)                                6.78e-1 0.129         5.25  2.23e-7
2 serum_prostatic_acid_phosphatase          -1.22e-2 0.00363      -3.35  8.75e-4
3 I(dosage^2)                               -1.87e-2 0.00902      -2.08  3.83e-2
4 serum_hemoglobin                          -4.68e-2 0.00950      -4.92  1.18e-6
5 dosage                                     9.47e-2 0.0695        1.36  1.74e-1
6 serum_prostatic_acid_phosphatase:I(dosag… -2.66e-5 0.0000654    -0.407 6.84e-1
7 serum_prostatic_acid_phosphatase:serum_h…  1.81e-3 0.000336      5.38  1.14e-7
8 serum_hemoglobin:dosage                    9.84e-4 0.00391       0.252 8.01e-1
bm_ap_hb_model |> 
  tidy() |>
  mutate(padj = p.adjust(p.value, method = "fdr")) |>
  filter(padj < 0.05) |>
  save_and_show_table("bone_sig_fit")
# A tibble: 4 × 6
  term                              estimate std.error statistic p.value    padj
  <chr>                                <dbl>     <dbl>     <dbl>   <dbl>   <dbl>
1 (Intercept)                        0.678    0.129         5.25 2.23e-7 8.90e-7
2 serum_prostatic_acid_phosphatase  -0.0122   0.00363      -3.35 8.75e-4 1.75e-3
3 serum_hemoglobin                  -0.0468   0.00950      -4.92 1.18e-6 3.14e-6
4 serum_prostatic_acid_phosphatase…  0.00181  0.000336      5.38 1.14e-7 8.90e-7

Here we can confirm that bone metastasis is indeed influenced by both AP and hemoglobin levels but not estrogen treatment dosage.

AP levels by bone metastasis status

As just shown serum AP is commonly elevated in advanced prostate cancer and is often used as a surrogate marker for metastatic activity. We compare AP levels between patients with and without bone metastasis and draw a boxplot to show the result.

df_num |>
  ggplot(aes(
    x = factor(bone_metastases), 
    y = serum_prostatic_acid_phosphatase)) +
  geom_boxplot() +
  scale_y_log10() + # for log scale
  labs(
    x = "Bone metastases",
    y = "AP (log scale)",
    title = "AP levels (log scale) by bone metastases status"
  )+
  theme(plot.title = element_text(hjust = 0.5) # keep titles in the middle
  )

We can see that AP levels are expected to be substantially higher in patients with bone metastases, reinforcing AP as a marker of advanced disease. This observation is consistent with the extreme AP values identified earlier in 04_describe.

Interpretation

In summary, several variables have been confirmed to be significantly associated with tumor size, and therefore with disease severity. In addition, correlations between key variables such as metastasis and AP levels have been confirmed. Along with this confirmation correlations for use in subsequent survival analysis have been confirmed. Blood-derived factors showed the expected association with the presence of bone metastases, but they did not show significant correlation with each other, nor with treatment.

Survial Analysis

library("tidyverse")
library("survival")
library("broom")
source("99_proj_func.R")

Load dataset

In the previous steps, the original prostate dataset was downloaded and cleaned, seraval other necessary variables were added, and the resulting dataset was saved as 03_dat_aug.tsv.gz. The analysis of that dataset will proceed in this section.

full_df <- read_tsv("../data/03_dat_aug.tsv.gz")

# check the data we loaded
full_df |> slice_sample(n = 10)
# A tibble: 10 × 22
   patient_no stage is_alive cause_of_death          age weight performance    
        <dbl> <dbl>    <dbl> <chr>                 <dbl>  <dbl> <chr>          
 1        149     4        1 other specific non-ca    83     88 normal activity
 2        358     3        1 heart or vascular        62     85 normal activity
 3        169     4        0 <NA>                     78     99 normal activity
 4        302     3        1 prostatic ca             60    122 normal activity
 5        428     4        1 prostatic ca             56    100 normal activity
 6        329     3        1 other specific non-ca    71     81 normal activity
 7        224     4        0 <NA>                     70     92 normal activity
 8        211     3        0 <NA>                     78    123 normal activity
 9        147     4        1 prostatic ca             71     77 normal activity
10         59     3        1 cerebrovascular          78    118 normal activity
# ℹ 15 more variables: medical_history <lgl>, study_date <date>,
#   follow_up_months <dbl>, sys_blood_pressure <dbl>, dia_blood_pressure <dbl>,
#   ekg <chr>, serum_hemoglobin <dbl>, size_of_primary_tumor <dbl>,
#   combined_index <dbl>, serum_prostatic_acid_phosphatase <dbl>,
#   bone_metastases <lgl>, treatment <chr>, dosage <dbl>, MAP <dbl>,
#   treatment_detail <chr>

Baseline comparison between treatment groups

Before conducting survival analyses, it is essential to evaluate whether the treatment groups differ at baseline. The original treatment variable contains four treatment arms (“0.2 mg estrogen”, “1.0 mg estrogen”, “5.0 mg estrogen”, “placebo”). For comparisons, we create a combined variable representing any treatment details then print the table.

full_df <- full_df |>
  mutate(
    treatment_detail = factor(treatment_detail, 
                              levels = c("Placebo", 
                                         "0.2mg Estrogen", 
                                         "1.0mg Estrogen", 
                                         "5.0mg Estrogen"))
  )

full_df |> 
  select(treatment_detail) |>
  table()
treatment_detail
       Placebo 0.2mg Estrogen 1.0mg Estrogen 5.0mg Estrogen 
           127            122            124            123 

Summary statistics by treatment group

Here we take a closer look at variables distribution within treatment groups.

full_df |>
  group_by(treatment_detail) |>
  summarise(
    n = n(),
    age_mean = mean(age, na.rm = TRUE),
    weight_mean = mean(weight, na.rm = TRUE),
    hemoglobin_mean = mean(serum_hemoglobin, na.rm = TRUE),
    ap_mean = mean(serum_prostatic_acid_phosphatase, na.rm = TRUE),
    tumor_size_mean = mean(size_of_primary_tumor, na.rm = TRUE),
    bm_rate = mean(bone_metastases, na.rm = TRUE),
    medical_history_rate = mean(medical_history, na.rm = TRUE)
    )
# A tibble: 4 × 9
  treatment_detail     n age_mean weight_mean hemoglobin_mean ap_mean
  <fct>            <int>    <dbl>       <dbl>           <dbl>   <dbl>
1 Placebo            127     71.7       100.             13.3    6.36
2 0.2mg Estrogen     122     70.9        99.5            13.4    4.43
3 1.0mg Estrogen     124     71.3        98.2            13.6    7.55
4 5.0mg Estrogen     123     71.9        98.3            13.6    7.46
# ℹ 3 more variables: tumor_size_mean <dbl>, bm_rate <dbl>,
#   medical_history_rate <dbl>

As we can see there are a few differences within each treatment group to be analysed.

Hypothesis tests for group differences

To ensure that patients in different treatment groups have similar health condition a statistic was conducted. Continuous variables were compared using ANOVA, and categorical variables using chi-square tests(these functions are from base R).

# Age
full_df |>
  oneway.test(age ~ treatment_detail,
              data = _)

    One-way analysis of means (not assuming equal variances)

data:  age and treatment_detail
F = 0.58027, num df = 3.00, denom df = 271.75, p-value = 0.6284
# Bone metastasis
chisq.test(table(full_df$bone_metastases, full_df$treatment_detail))

    Pearson's Chi-squared test

data:  table(full_df$bone_metastases, full_df$treatment_detail)
X-squared = 6.359, df = 3, p-value = 0.09539
# Serum AP
full_df |>
  oneway.test(serum_prostatic_acid_phosphatase ~ treatment_detail,
              data = _)

    One-way analysis of means (not assuming equal variances)

data:  serum_prostatic_acid_phosphatase and treatment_detail
F = 1.1677, num df = 3.00, denom df = 254.72, p-value = 0.3226
# Hemoglobin
full_df |>
  oneway.test(serum_hemoglobin ~ treatment_detail,
              data = _)

    One-way analysis of means (not assuming equal variances)

data:  serum_hemoglobin and treatment_detail
F = 0.67823, num df = 3.00, denom df = 272.82, p-value = 0.566

Significant differences between groups indicate that treatment allocation is not fully randomized with respect to disease burden or patient health status. Such imbalances motivate the need for multivariable survival modeling to adjust for these covariates when estimating treatment effects. All the p-values are above 0.05, which indicates our samples are evenly spread.

Interpretation

Baseline comparisons help establish whether survival differences are attributed to treatment itself or reflect initial imbalances. For example, higher serum AP or greater frequency of bone metastasis in one group would imply greater disease severity at baseline, which may independently explain poorer outcomes. The results above provide our data is suitable for the survival analyses presented in the following section.

Survival analysis

To evaluate whether treatment has an effect on time-to-event outcomes, we perform non-parametric survival analysis using Kaplan–Meier estimators and log-rank tests. The dataset provides follow-up time (follow_up_months) and a binary mortality indicator (event), which allows construction of survival objects. (Some of the analysis(like cox models) was done by the package survive, in those sections we’ll pay more attention to data visualization.)

Kaplan–Meier survival curves by treatment group

We begin by constructing a survival object:

surv_obj <- Surv(time = full_df$follow_up_months,
                 event = full_df$is_alive)

Next, we estimate the Kaplan–Meier curves for the four treatment groups: 0.2 mg estrogen vs 1.0mg estrogen vs 5.0mg estrogen vs placebo. We wrote function km_fit to calculated the survival possibility at each time point and then draw a step plot to the relationship of survival possibility and time.

# calculate KM
km_res <- km_fit(
  data = full_df,
  time = "follow_up_months",
  event = "is_alive",
  group = "treatment_detail"
)

# draw survival curve
(km_res |>
  ggplot(aes(x = time, y = surv, color = group)) +
  geom_step(linewidth = 1) +
  labs(
    x = "Follow-up months",
    y = "Survival probability",
    title = "Kaplan–Meier Curves"
  ) +
  theme_minimal() +
  theme(plot.title = element_text(hjust = 0.5) # keep titles in the middle
  )) |> save_and_show("surv_curv by treatment")
Saving 7 x 5 in image

The Kaplan–Meier curves illustrate how survival probabilities decline over time for patients receiving estrogen therapy versus placebo. The 1.0mg Estrogen group have a significantly larger survival probability than the other, which provides a brief insight that this can be considered as a effective treatment.

Log-rank test

A log-rank test formally assesses whether the survival curves of the two treatment groups differ.

logrank_test <- survdiff(surv_obj ~ treatment_detail, data = full_df)
logrank_test
Call:
survdiff(formula = surv_obj ~ treatment_detail, data = full_df)

                                  N Observed Expected (O-E)^2/E (O-E)^2/V
treatment_detail=Placebo        127       95     88.1     0.538     0.734
treatment_detail=0.2mg Estrogen 122       93     84.5     0.849     1.141
treatment_detail=1.0mg Estrogen 124       71     93.9     5.603     7.808
treatment_detail=5.0mg Estrogen 123       92     84.4     0.681     0.915

 Chisq= 7.8  on 3 degrees of freedom, p= 0.05 

The p-value is 0.04 (< 0.05), which indicates statistically significant differences in survival between the treatment groups. The results from the survival curves is in agreement with this indication.

Cox proportional hazards modeling

Kaplan–Meier curves provide an unadjusted comparison of survival between the treatment groups, but they do not account for baseline imbalances in disease severity or physiological status. To estimate treatment effects while controlling for potential confounders, a Cox proportional hazards model is applied. Both univariate and multivariable Cox regressions are considered to evaluate the association between clinical variables and mortality risk.

Univariate Cox models

First fitted are univariate Cox models for key predictors identified in Sections 2 and 3, including tumor burden (AP, tumor size, bone metastases), physiological status (hemoglobin, age, MAP), and clinical stage.

cox_vars <- c("treatment_detail",
              "age",
              "serum_hemoglobin",
              "serum_prostatic_acid_phosphatase",
              "size_of_primary_tumor",
              "bone_metastases",
              "stage",
              "MAP",
              "weight")

# merge the results
uni_df <- map_df(cox_vars, function(v) {
  formula <- as.formula(str_c("Surv(follow_up_months, is_alive) ~ ", v))
  model <- coxph(formula, data = full_df)
  tidy(model, exponentiate = TRUE, conf.int = TRUE) |>
    mutate(variable = v)
})

# rename for cleanner look
uni_df <- uni_df |>
  mutate(
    Variable = case_when(
      variable == "treatment_detail" & grepl("0\\.2mg", term) ~ "Estrogen 0.2mg (vs placebo)",
      variable == "treatment_detail" & grepl("1\\.0mg", term) ~ "Estrogen 1.0mg (vs placebo)",
      variable == "treatment_detail" & grepl("5\\.0mg", term) ~ "Estrogen 5.0mg (vs placebo)",
      variable == "stage" & term == "stage4" ~ "Stage IV (vs Stage III)",
      variable == "bone_metastases" ~ "Bone metastasis (yes vs no)",
      variable == "serum_hemoglobin" ~ "Hemoglobin",
      variable == "size_of_primary_tumor" ~ "Tumor size",
      variable == "serum_prostatic_acid_phosphatase" ~ "AP",
      variable == "MAP" ~ "MAP",
      variable == "age" ~ "Age",
      variable == "weight" ~ "Weight",
      TRUE ~ term
    ),
    HR = estimate,
    lower = conf.low,
    upper = conf.high,
    p_sig = p.value < 0.05,
    HR_color = ifelse(HR > 1, "orange", "blue"),
    label_color = ifelse(p_sig, "red", "black"),
    label_face = ifelse(p_sig, "bold", "plain")
  ) |>
  arrange(HR) |>
  mutate(Variable = factor(Variable, levels = Variable))

# draw forestplot
(uni_df |>
  ggplot(aes(x = HR, y = Variable)) +
  geom_point(aes(color = HR_color), size = 3) +
  geom_errorbarh(aes(xmin = lower, xmax = upper, color = HR_color), height = 0.25) +
  geom_vline(xintercept = 1, linetype = "dashed") +
  scale_color_identity() +
  scale_x_log10() +
  labs(
    title = "Univariate Cox Model",
    x = "Hazard Ratio (log scale)",
    y = ""
  ) +
  theme_minimal(base_size = 14) +
  theme(
    axis.text.y = element_text(size = 12,
                               color = uni_df$label_color,
                               face = uni_df$label_face)
  )) |> save_and_show("Univariate_cox_model")
Saving 7 x 5 in image
Warning: Vectorized input to `element_text()` is not officially supported.
ℹ Results may be unexpected or may change in future versions of ggplot2.

In the forest plot, variables are marked with HR > 1 orange and those with HR < 1 blue. The variables with significant difference are shown red and emphasis their names. The univariate Cox analyses shows that Age, AP, tumor size, bone metastasis, and clinical stage were all associated with increased hazard of death, consistent with known clinical behavior of advanced prostate cancer. Hemoglobin was protective. Among treatment arms, only 1.0mg estrogen showed a significantly reduced hazard compared with placebo. MAP showed no significant association with survival.

Multivariable Cox model

Next a multivariable model incorporating treatment group and major clinical covariates is fitted. Predictors were chosen based on biological relevance and the exploratory findings from earlier analysis sections.

cox_multi <- coxph(
  Surv(follow_up_months, is_alive) ~
    treatment_detail +
    age +
    serum_hemoglobin +
    serum_prostatic_acid_phosphatase +
    size_of_primary_tumor +
    bone_metastases +
    stage +
    MAP+
    weight,
  data = full_df
  )

# clean the result
multi_df <- tidy(cox_multi, exponentiate = TRUE, conf.int = TRUE) |>
  mutate(
    HR = estimate,
    lower = conf.low,
    upper = conf.high,
    Variable = term,
    p_sig = p.value < 0.05,    # whether it's significant
    HR_color = ifelse(HR > 1, "orange", "blue"),   
    label_color = ifelse(p_sig, "red", "black"),   
    label_face = ifelse(p_sig, "bold", "plain")    
  ) |>
  # rename
  mutate(
    Variable = case_when(
      grepl("0\\.2mg", Variable) ~ "Estrogen 0.2mg (vs placebo)",
      grepl("1\\.0mg", Variable) ~ "Estrogen 1.0mg (vs placebo)",
      grepl("5\\.0mg", Variable) ~ "Estrogen 5.0mg (vs placebo)",
      Variable == "stage4" ~ "Stage IV (vs Stage III)",
      Variable == "bone_metastasesTRUE" ~ "Bone metastasis (yes vs no)",
      Variable == "serum_hemoglobin" ~ "Hemoglobin",
      Variable == "size_of_primary_tumor" ~ "Tumor size",
      Variable == "serum_prostatic_acid_phosphatase" ~ "AP",
      Variable == "MAP" ~ "MAP",
      Variable == "age" ~ "Age",
      Variable == "weight" ~ "Weight",
      TRUE ~ Variable
    )
  ) |>
  arrange(HR) |>                   # order
  mutate(Variable = factor(Variable, levels = Variable))

# draw forestplot
(multi_df |>
  ggplot(aes(x = HR, y = Variable)) +
  geom_point(aes(color = HR_color), size = 3) +
  geom_errorbarh(aes(xmin = lower, xmax = upper, color = HR_color),
                 height = 0.25) +
  geom_vline(xintercept = 1, linetype = "dashed") +
  scale_color_identity() +    
  scale_x_log10() +
  labs(
    title = "Multivariable Cox Model",
    x = "Hazard Ratio (log scale)",
    y = ""
  ) +
  theme_minimal(base_size = 14) +
  theme(
    axis.text.y = element_text(size = 12,
                               color = multi_df$label_color,
                               face = multi_df$label_face),
    plot.title = element_text(size = 18, face = "bold"),
    panel.grid.major.y = element_blank()
  )) |> save_and_show("Multivarible_cox_model")
Saving 7 x 5 in image
Warning: Vectorized input to `element_text()` is not officially supported.
ℹ Results may be unexpected or may change in future versions of ggplot2.

The forest plot is similar to the one shown in the univariate section. Indicative in the forest plot bone metastasis, tumor size, AP, and age were strong and statistically significant risk factors, while hemoglobin and the 1.0mg estrogen treatment remained significant protective factors. MAP had no independent prognostic value. Clinical stage lost significance after adjustment, likely due to the stronger explanatory power of AP and metastatic status. These findings indicate that tumor burden biomarkers are the dominant determinants of survival, and 1.0mg estrogen demonstrated a robust and independent therapeutic benefit.

Interpretation

The multivariable model allows us to draw refined conclusions about predictors of mortality(Bone metastasis, tumor size, AP, and age).

Outlier and high-risk patient exploration

Earlier discussion highlighted the presence of extreme values in serum acid phosphatase (AP), a biomarker commonly elevated in advanced or metastatic prostate cancer. In this section, outliers are identified, and their clinical characteristics are examined and evaluated based on their survival patterns. This should reveal clinically meaningful patient subgroups at disproportionately high risk.

Identifying extreme AP values

This begins by examining the upper tail of the AP distribution(using quantile).

quantile(full_df$serum_prostatic_acid_phosphatase, probs = c(0.75, 0.90, 0.95, 0.99), na.rm = TRUE)
       75%        90%        95%        99% 
  2.699707  18.500000  34.044922 100.617969 

The 95th percentile threshold provides a definition of unusually high AP. Patients above this cutoff are flagged as having severe biochemical evidence of disease activity.

high_ap_threshold <- quantile(full_df$serum_prostatic_acid_phosphatase, 0.95, na.rm = TRUE)
high_ap_df <- full_df |> filter(serum_prostatic_acid_phosphatase >= high_ap_threshold)

high_ap_df |> select(patient_no, serum_prostatic_acid_phosphatase, bone_metastases, stage,
                     size_of_primary_tumor)
# A tibble: 25 × 5
   patient_no serum_prostatic_acid…¹ bone_metastases stage size_of_primary_tumor
        <dbl>                  <dbl> <lgl>           <dbl>                 <dbl>
 1         24                   35.7 TRUE                4                    26
 2         33                   35.4 FALSE               4                    38
 3         36                   99.2 TRUE                4                    10
 4         68                  175.  TRUE                4                    27
 5        112                   39.7 FALSE               4                    61
 6        113                   41.7 TRUE                4                    14
 7        119                   78.4 TRUE                4                    41
 8        143                   42.9 TRUE                4                    34
 9        151                  128.  FALSE               4                    23
10        153                  225.  TRUE                4                    17
# ℹ 15 more rows
# ℹ abbreviated name: ¹​serum_prostatic_acid_phosphatase

AP and bone metastases

Because AP is strongly associated with metastatic spread as shown earlier, we assess how extreme AP values align with bone metastasis status by counting the bone_metastases in high_ap groups and draw a barplot.

high_ap_df |>
  ggplot(aes(x = bone_metastases)) +
  geom_bar(fill = "steelblue", alpha = 0.6, width = 0.5, color = "black") +
  geom_text(
    stat = "count",
    aes(label = ..count..),
    vjust = -0.2, # in the middle
    size = 5
  ) + # show counts
  labs(
    x = "Bone metastases",
    y = "Count",
    title = "Frequency of Bone Metastases"
  ) +
  theme_minimal()+
  theme(plot.title = element_text(hjust = 0.5) # keep titles in the middle
  )
Warning: The dot-dot notation (`..count..`) was deprecated in ggplot2 3.4.0.
ℹ Please use `after_stat(count)` instead.

A large fraction of high-AP patients exhibiting bone metastasis strengthens the interpretation that AP reflects tumor burden and metastatic progression.

Survival of high-AP vs non-high-AP patients

To quantify risk, patients are stratified into high-risk (AP above the 95th percentile) and others, and compare survival curves (obtained by using the same km_fit function used earlier).

full_df <- full_df |>
  mutate(
    ap_risk_group = ifelse(serum_prostatic_acid_phosphatase >= high_ap_threshold, "High AP", "Normal AP"),
    ap_risk_group = factor(ap_risk_group)
    )

km_ap <- km_fit(
  data = full_df,
  time_col = "follow_up_months",
  event_col = "is_alive",
  group_col = "ap_risk_group"
)

# draw surv curve

(km_ap |>
  ggplot(aes(x = time, y = surv, color = group)) +
  geom_step(size = 1.2) +
  labs(
    x = "Follow-up time (months)",
    y = "Survival probability",
    title = "Survival of High-AP vs Normal-AP Patients"
  ) +
  scale_color_manual(
    values = c(
      "High AP" = "darkred",
      "Normal AP" = "darkgreen"
    )
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 14, face = "bold"),
    legend.title = element_blank()
  )) |> save_and_show("high_ap_comparison")
Saving 7 x 5 in image
Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
ℹ Please use `linewidth` instead.

Patients with extremely elevated AP typically show sharply reduced survival, highlighting AP as a powerful prognostic biomarker.

Interpretation

The exploration of outliers reveals distinct, clinically meaningful high-risk subgroups. These findings demonstrate that AP is a strong prognostic indicator and justify its inclusion in multivariable survival modeling as well as in clinical interpretation of treatment outcomes.

Conclusion

This survival analysis, based on 491 patients, successfully estimated Kaplan–Meier survival curves and fitted both univariate and multivariable Cox proportional hazards models. Kaplan–Meier comparisons demonstrated significant differences in survival probabilities among the placebo, 0.2 mg, 1.0 mg, and 5.0 mg estrogen treatment groups.

In univariate Cox analyses, Age, AP, tumor size, bone metastasis, and clinical stage were strong adverse prognostic factors, whereas hemoglobin was protective. Among the treatment arms, only the 1.0 mg estrogen dose demonstrated a significantly reduced hazard of death relative to placebo. MAP showed no significant association with survival.

In the multivariable Cox model, bone metastasis, tumor size, AP, and age remained independent predictors of increased mortality. Hemoglobin and the 1.0 mg estrogen treatment continued to show significant protective effects. MAP did not demonstrate independent prognostic value, and clinical stage lost significance after adjustment, likely reflecting the dominant explanatory role of tumor burden markers such as AP and metastatic status.

Overall, these results indicate that tumor burden biomarkers are the primary determinants of survival in this cohort, and that 1.0 mg estrogen provides a meaningful and independent therapeutic benefit.

PCA Analysis

Load library

library("tidyverse")
library("readr")
library("dplyr")
library("factoextra")
source("99_proj_func.R") # custom functions and theme

Load data

#df <- read_tsv(str_c("../data/02_dat_tidy.tsv.gz"))
#df <- as.data.frame(df)
treatment_df <- read_tsv("../data/01_treatment_dat_load.tsv")
Rows: 502 Columns: 11
── Column specification ────────────────────────────────────────────────────────
Delimiter: "\t"
chr (2): rx, ekg
dbl (9): patno, dtime, sbp, dbp, hg, sz, sg, ap, bm

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

PCA

Numeric

df_numeric <- treatment_df[, -1] |> #don't take in account patient no
  select(where(is.numeric)) |> 
  drop_na()

#pca_res <- prcomp(df_numeric, scale = TRUE)

Normalization

data_normalized <- scale(df_numeric)
head(data_normalized)
          dtime        sbp        dbp          hg         sz         sg
[1,]  1.5540601  0.2603041  0.5825557  0.19525350 -1.0151514 -1.1470768
[2,]  0.1755885 -0.1504355 -0.1029699 -0.01259105 -0.9336845 -0.6506571
[3,] -0.6859562 -0.1504355 -0.7884955  2.16724215 -0.8522176 -1.1470768
[4,]  1.2525194  1.0817833  1.2680813 -0.01259105  1.5917896 -1.1470768
[5,] -0.5136473  1.9032624  1.2680813  0.87049489 -0.3634162  0.3421823
[6,]  0.4340519 -0.1504355  1.2680813 -0.21942174 -0.1190154 -0.6506571
             ap         bm
[1,] -0.1928693 -0.4434398
[2,] -0.1928693 -0.4434398
[3,] -0.1833697 -0.4434398
[4,] -0.1897021 -0.4434398
[5,] -0.1881190 -0.4434398
[6,] -0.1849528 -0.4434398
data_pca <- princomp(data_normalized)
summary(data_pca)
Importance of components:
                          Comp.1    Comp.2    Comp.3    Comp.4     Comp.5
Standard deviation     1.4573083 1.2619281 1.0077455 0.9787729 0.87139369
Proportion of Variance 0.2660158 0.1994682 0.1272056 0.1199965 0.09511157
Cumulative Proportion  0.2660158 0.4654840 0.5926896 0.7126861 0.80779768
                           Comp.6     Comp.7    Comp.8
Standard deviation     0.80556910 0.72736353 0.5970389
Proportion of Variance 0.08128495 0.06626857 0.0446488
Cumulative Proportion  0.88908263 0.95535120 1.0000000

Visualization - Scree plot

fviz_eig(data_pca, addlabels = TRUE) |> 
save_and_show("PCA_plot")
Saving 7 x 5 in image

Visualization - PC1&PC2

fviz_pca_ind(data_pca,
             axes = c(1,2),     
             repel = TRUE,       
             col.ind = "blue", 
             select.var = list(contrib = 10))  

PCA 2 removing variable

data_pca2 <- data_normalized[, -5] 
data_pca2 <- princomp(data_normalized)
fviz_eig(data_pca2, addlabels = TRUE)

Conclusion

Even when removing variables, we see that we cannot apply PCA to our model.

#render all notebooks
library("quarto")

qmds <- list.files(pattern = "\\.qmd$")
qmds <- qmds[qmds != "00_all.qmd"] #dont render current notebook i dont know if that would create a recursive loop
qmds <- qmds[qmds != "0X_background.qmd"]

for (qmd_document in qmds) {
  quarto_render(qmd_document,
                output_format = "html",
                execute_params = list(embed_resources = TRUE))}


processing file: 01_load.qmd

  |                                                          
  |                                                    |   0%
  |                                                          
  |......                                              |  11%                  
  |                                                          
  |............                                        |  22% [unnamed-chunk-1]
  |                                                          
  |.................                                   |  33%                  
  |                                                          
  |.......................                             |  44% [unnamed-chunk-2]
  |                                                          
  |.............................                       |  56%                  
  |                                                          
  |...................................                 |  67% [unnamed-chunk-3]
  |                                                          
  |........................................            |  78%                  
  |                                                          
  |..............................................      |  89% [unnamed-chunk-4]
  |                                                          
  |....................................................| 100%                  
                                                                                                            
output file: 01_load.knit.md

Warning message:
In png(..., res = dpi, units = "in") :
  unable to open connection to X11 display ''
pandoc 
  to: html
  output-file: 01_load.html
  standalone: true
  self-contained: true
  section-divs: true
  html-math-method: mathjax
  wrap: none
  default-image-extension: png
  
metadata
  document-css: false
  link-citations: true
  date-format: long
  lang: en
  title: 01_load
  
Output created: 01_load.html



processing file: 02_tidy.qmd

  |                                                         
  |                                                   |   0%
  |                                                         
  |..                                                 |   5%                   
  |                                                         
  |.....                                              |  10% [unnamed-chunk-1] 
  |                                                         
  |.......                                            |  14%                   
  |                                                         
  |..........                                         |  19% [unnamed-chunk-2] 
  |                                                         
  |............                                       |  24%                   
  |                                                         
  |...............                                    |  29% [unnamed-chunk-3] 
  |                                                         
  |.................                                  |  33%                   
  |                                                         
  |...................                                |  38% [unnamed-chunk-4] 
  |                                                         
  |......................                             |  43%                   
  |                                                         
  |........................                           |  48% [unnamed-chunk-5] 
  |                                                         
  |...........................                        |  52%                   
  |                                                         
  |.............................                      |  57% [unnamed-chunk-6] 
  |                                                         
  |................................                   |  62%                   
  |                                                         
  |..................................                 |  67% [unnamed-chunk-7] 
  |                                                         
  |....................................               |  71%                   
  |                                                         
  |.......................................            |  76% [unnamed-chunk-8] 
  |                                                         
  |.........................................          |  81%                   
  |                                                         
  |............................................       |  86% [unnamed-chunk-9] 
  |                                                         
  |..............................................     |  90%                   
  |                                                         
  |.................................................  |  95% [unnamed-chunk-10]
  |                                                         
  |...................................................| 100%                   
                                                                                                             
output file: 02_tidy.knit.md

Warning message:
In png(..., res = dpi, units = "in") :
  unable to open connection to X11 display ''
pandoc 
  to: html
  output-file: 02_tidy.html
  standalone: true
  self-contained: true
  section-divs: true
  html-math-method: mathjax
  wrap: none
  default-image-extension: png
  
metadata
  document-css: false
  link-citations: true
  date-format: long
  lang: en
  title: 02_tidy
  
Output created: 02_tidy.html



processing file: 03_augment.qmd

  |                                                         
  |                                                   |   0%
  |                                                         
  |..                                                 |   4%                   
  |                                                         
  |....                                               |   9% [unnamed-chunk-1] 
  |                                                         
  |.......                                            |  13%                   
  |                                                         
  |.........                                          |  17% [unnamed-chunk-2] 
  |                                                         
  |...........                                        |  22%                   
  |                                                         
  |.............                                      |  26% [unnamed-chunk-3] 
  |                                                         
  |................                                   |  30%                   
  |                                                         
  |..................                                 |  35% [unnamed-chunk-4] 
  |                                                         
  |....................                               |  39%                   
  |                                                         
  |......................                             |  43% [unnamed-chunk-5] 
  |                                                         
  |........................                           |  48%                   
  |                                                         
  |...........................                        |  52% [unnamed-chunk-6] 
  |                                                         
  |.............................                      |  57%                   
  |                                                         
  |...............................                    |  61% [unnamed-chunk-7] 
  |                                                         
  |.................................                  |  65%                   
  |                                                         
  |...................................                |  70% [unnamed-chunk-8] 
  |                                                         
  |......................................             |  74%                   
  |                                                         
  |........................................           |  78% [unnamed-chunk-9] 
  |                                                         
  |..........................................         |  83%                   
  |                                                         
  |............................................       |  87% [unnamed-chunk-10]
  |                                                         
  |...............................................    |  91%                   
  |                                                         
  |.................................................  |  96% [unnamed-chunk-11]
  |                                                         
  |...................................................| 100%                   
                                                                                                             
output file: 03_augment.knit.md

Warning message:
In png(..., res = dpi, units = "in") :
  unable to open connection to X11 display ''
pandoc 
  to: html
  output-file: 03_augment.html
  standalone: true
  self-contained: true
  section-divs: true
  html-math-method: mathjax
  wrap: none
  default-image-extension: png
  
metadata
  document-css: false
  link-citations: true
  date-format: long
  lang: en
  title: 03_augment
  
Output created: 03_augment.html



processing file: 04_describe.qmd

  |                                                          
  |                                                    |   0%
  |                                                          
  |...                                                 |   5%                  
  |                                                          
  |.....                                               |  11% [unnamed-chunk-1]
  |                                                          
  |........                                            |  16%                  
  |                                                          
  |...........                                         |  21% [unnamed-chunk-2]
  |                                                          
  |..............                                      |  26%                  
  |                                                          
  |................                                    |  32% [unnamed-chunk-3]
  |                                                          
  |...................                                 |  37%                  
  |                                                          
  |......................                              |  42% [unnamed-chunk-4]
  |                                                          
  |.........................                           |  47%                  
  |                                                          
  |...........................                         |  53% [unnamed-chunk-5]
  |                                                          
  |..............................                      |  58%                  
  |                                                          
  |.................................                   |  63% [unnamed-chunk-6]
  |                                                          
  |....................................                |  68%                  
  |                                                          
  |......................................              |  74% [unnamed-chunk-7]
  |                                                          
  |.........................................           |  79%                  
  |                                                          
  |............................................        |  84% [unnamed-chunk-8]
  |                                                          
  |...............................................     |  89%                  
  |                                                          
  |.................................................   |  95% [unnamed-chunk-9]
  |                                                          
  |....................................................| 100%                  
                                                                                                            
output file: 04_describe.knit.md

Warning message:
In png(..., res = dpi, units = "in") :
  unable to open connection to X11 display ''
pandoc 
  to: html
  output-file: 04_describe.html
  standalone: true
  self-contained: true
  section-divs: true
  html-math-method: mathjax
  wrap: none
  default-image-extension: png
  
metadata
  document-css: false
  link-citations: true
  date-format: long
  lang: en
  title: 04_describe
  
Output created: 04_describe.html



processing file: 05_model.qmd

  |                                                         
  |                                                   |   0%
  |                                                         
  |.                                                  |   2%                   
  |                                                         
  |..                                                 |   3% [unnamed-chunk-1] 
  |                                                         
  |...                                                |   5%                   
  |                                                         
  |...                                                |   7% [unnamed-chunk-2] 
  |                                                         
  |....                                               |   8%                   
  |                                                         
  |.....                                              |  10% [unnamed-chunk-3] 
  |                                                         
  |......                                             |  11%                   
  |                                                         
  |.......                                            |  13% [unnamed-chunk-4] 
  |                                                         
  |........                                           |  15%                   
  |                                                         
  |........                                           |  16% [unnamed-chunk-5] 
  |                                                         
  |.........                                          |  18%                   
  |                                                         
  |..........                                         |  20% [unnamed-chunk-6] 
  |                                                         
  |...........                                        |  21%                   
  |                                                         
  |............                                       |  23% [unnamed-chunk-7] 
  |                                                         
  |.............                                      |  25%                   
  |                                                         
  |.............                                      |  26% [unnamed-chunk-8] 
  |                                                         
  |..............                                     |  28%                   
  |                                                         
  |...............                                    |  30% [unnamed-chunk-9] 
  |                                                         
  |................                                   |  31%                   
  |                                                         
  |.................                                  |  33% [unnamed-chunk-10]
  |                                                         
  |..................                                 |  34%                   
  |                                                         
  |..................                                 |  36% [unnamed-chunk-11]
  |                                                         
  |...................                                |  38%                   
  |                                                         
  |....................                               |  39% [unnamed-chunk-12]
  |                                                         
  |.....................                              |  41%                   
  |                                                         
  |......................                             |  43% [unnamed-chunk-13]
  |                                                         
  |.......................                            |  44%                   
  |                                                         
  |.......................                            |  46% [unnamed-chunk-14]
  |                                                         
  |........................                           |  48%                   
  |                                                         
  |.........................                          |  49% [unnamed-chunk-15]
  |                                                         
  |..........................                         |  51%                   
  |                                                         
  |...........................                        |  52% [unnamed-chunk-16]
  |                                                         
  |............................                       |  54%                   
  |                                                         
  |............................                       |  56% [unnamed-chunk-17]
  |                                                         
  |.............................                      |  57%                   
  |                                                         
  |..............................                     |  59% [unnamed-chunk-18]
  |                                                         
  |...............................                    |  61%                   
  |                                                         
  |................................                   |  62% [unnamed-chunk-19]
  |                                                         
  |.................................                  |  64%                   
  |                                                         
  |.................................                  |  66% [unnamed-chunk-20]
  |                                                         
  |..................................                 |  67%                   
  |                                                         
  |...................................                |  69% [unnamed-chunk-21]
  |                                                         
  |....................................               |  70%                   
  |                                                         
  |.....................................              |  72% [unnamed-chunk-22]
  |                                                         
  |......................................             |  74%                   
  |                                                         
  |......................................             |  75% [unnamed-chunk-23]
  |                                                         
  |.......................................            |  77%                   
  |                                                         
  |........................................           |  79% [unnamed-chunk-24]
  |                                                         
  |.........................................          |  80%                   
  |                                                         
  |..........................................         |  82% [unnamed-chunk-25]
  |                                                         
  |...........................................        |  84%                   
  |                                                         
  |...........................................        |  85% [unnamed-chunk-26]
  |                                                         
  |............................................       |  87%                   
  |                                                         
  |.............................................      |  89% [unnamed-chunk-27]
  |                                                         
  |..............................................     |  90%                   
  |                                                         
  |...............................................    |  92% [unnamed-chunk-28]
  |                                                         
  |................................................   |  93%                   
  |                                                         
  |................................................   |  95% [unnamed-chunk-29]
  |                                                         
  |.................................................  |  97%                   
  |                                                         
  |.................................................. |  98% [unnamed-chunk-30]
  |                                                         
  |...................................................| 100%                   
                                                                                                             
output file: 05_model.knit.md

Warning message:
In png(..., res = dpi, units = "in") :
  unable to open connection to X11 display ''
pandoc 
  to: html
  output-file: 05_model.html
  standalone: true
  self-contained: true
  section-divs: true
  html-math-method: mathjax
  wrap: none
  default-image-extension: png
  
metadata
  document-css: false
  link-citations: true
  date-format: long
  lang: en
  title: 05_model
  
Output created: 05_model.html



processing file: 06_survival_analysis.qmd

  |                                                         
  |                                                   |   0%
  |                                                         
  |..                                                 |   3%                   
  |                                                         
  |....                                               |   7% [unnamed-chunk-1] 
  |                                                         
  |.....                                              |  10%                   
  |                                                         
  |.......                                            |  14% [unnamed-chunk-2] 
  |                                                         
  |.........                                          |  17%                   
  |                                                         
  |...........                                        |  21% [unnamed-chunk-3] 
  |                                                         
  |............                                       |  24%                   
  |                                                         
  |..............                                     |  28% [unnamed-chunk-4] 
  |                                                         
  |................                                   |  31%                   
  |                                                         
  |..................                                 |  34% [unnamed-chunk-5] 
  |                                                         
  |...................                                |  38%                   
  |                                                         
  |.....................                              |  41% [unnamed-chunk-6] 
  |                                                         
  |.......................                            |  45%                   
  |                                                         
  |.........................                          |  48% [unnamed-chunk-7] 
  |                                                         
  |..........................                         |  52%                   
  |                                                         
  |............................                       |  55% [unnamed-chunk-8] 
  |                                                         
  |..............................                     |  59%                   
  |                                                         
  |................................                   |  62% [unnamed-chunk-9] 
  |                                                         
  |.................................                  |  66%                   
  |                                                         
  |...................................                |  69% [unnamed-chunk-10]
  |                                                         
  |.....................................              |  72%                   
  |                                                         
  |.......................................            |  76% [unnamed-chunk-11]
  |                                                         
  |........................................           |  79%                   
  |                                                         
  |..........................................         |  83% [unnamed-chunk-12]
  |                                                         
  |............................................       |  86%                   
  |                                                         
  |..............................................     |  90% [unnamed-chunk-13]
  |                                                         
  |...............................................    |  93%                   
  |                                                         
  |.................................................  |  97% [unnamed-chunk-14]
  |                                                         
  |...................................................| 100%                   
                                                                                                             
output file: 06_survival_analysis.knit.md

Warning message:
In png(..., res = dpi, units = "in") :
  unable to open connection to X11 display ''
pandoc 
  to: html
  output-file: 06_survival_analysis.html
  standalone: true
  self-contained: true
  section-divs: true
  html-math-method: mathjax
  wrap: none
  default-image-extension: png
  
metadata
  document-css: false
  link-citations: true
  date-format: long
  lang: en
  title: 06_survival_analysis
  
Output created: 06_survival_analysis.html



processing file: 07_PCA.qmd

  |                                                          
  |                                                    |   0%
  |                                                          
  |...                                                 |   5%                  
  |                                                          
  |.....                                               |  11% [unnamed-chunk-1]
  |                                                          
  |........                                            |  16%                  
  |                                                          
  |...........                                         |  21% [unnamed-chunk-2]
  |                                                          
  |..............                                      |  26%                  
  |                                                          
  |................                                    |  32% [unnamed-chunk-3]
  |                                                          
  |...................                                 |  37%                  
  |                                                          
  |......................                              |  42% [unnamed-chunk-4]
  |                                                          
  |.........................                           |  47%                  
  |                                                          
  |...........................                         |  53% [unnamed-chunk-5]
  |                                                          
  |..............................                      |  58%                  
  |                                                          
  |.................................                   |  63% [unnamed-chunk-6]
  |                                                          
  |....................................                |  68%                  
  |                                                          
  |......................................              |  74% [unnamed-chunk-7]
  |                                                          
  |.........................................           |  79%                  
  |                                                          
  |............................................        |  84% [unnamed-chunk-8]
  |                                                          
  |...............................................     |  89%                  
  |                                                          
  |.................................................   |  95% [unnamed-chunk-9]
  |                                                          
  |....................................................| 100%                  
                                                                                                            
output file: 07_PCA.knit.md

Warning message:
In png(..., res = dpi, units = "in") :
  unable to open connection to X11 display ''
pandoc 
  to: html
  output-file: 07_PCA.html
  standalone: true
  self-contained: true
  section-divs: true
  html-math-method: mathjax
  wrap: none
  default-image-extension: png
  
metadata
  document-css: false
  link-citations: true
  date-format: long
  lang: en
  title: PCA
  
Output created: 07_PCA.html
#move them to res dir
html_names <- sub("\\.qmd$", ".html", qmds)

#move the files
if (!dir.exists("../results")){
  dir.create("../results", recursive = TRUE)}

# move each file
file.rename(html_names, file.path("../results", basename(html_names)))
[1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE
#Render the presentation as well
quarto_render("../doc/presentation.qmd")#presentation shouldnt be moved according to proj description
pandoc 
  to: revealjs
  output-file: presentation.html
  standalone: true
  self-contained: true
  wrap: none
  default-image-extension: png
  html-math-method:
    method: mathjax
    url: >-
      https://cdnjs.cloudflare.com/ajax/libs/mathjax/2.7.0/MathJax.js?config=TeX-AMS_HTML-full
  slide-level: 2
  
metadata
  link-citations: true
  width: 1050
  height: 700
  margin: 0.1
  center: false
  navigationMode: linear
  controlsLayout: edges
  controlsTutorial: false
  hash: true
  history: true
  hashOneBasedIndex: false
  fragmentInURL: false
  transition: fade
  backgroundTransition: none
  pdfSeparateFragments: false
  lang: en
  auto-stretch: true
  title: Project presentation
  author: 'Laurine Panel, Rasmus Tøffner-Clausen, Mathias Glerup Filtenborg, Alexander Videbæk, Qiju Chen'
  date: 2 dec 2025
  footer: Project-group20
  slideNumber: true
  smaller: true
  
[WARNING] Could not fetch resource dataset.png
[WARNING] Could not fetch resource ../results/figures/background2.png
[WARNING] Could not fetch resource ../results/figures/background2.png
[WARNING] Could not fetch resource ../results/figures/background3.png
[WARNING] Could not fetch resource ../results/figures/background3.png
[WARNING] Could not fetch resource ../results/figures/background3.png
[WARNING] Could not fetch resource ../results/figures/background3.png
[WARNING] Could not fetch resource ../results/figures/background3.png
[WARNING] Could not fetch resource ../results/figures/background4.png
Output created: presentation.html
# print system info
sessionInfo()
R version 4.4.2 (2024-10-31)
Platform: x86_64-pc-linux-gnu
Running under: Ubuntu 24.04.3 LTS

Matrix products: default
BLAS:   /net/pupil2/home/ctools/opt/R-4.4.2_22100/lib/libRblas.so 
LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.12.0

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

time zone: Europe/Copenhagen
tzcode source: system (glibc)

attached base packages:
[1] stats     graphics  grDevices utils     datasets  methods   base     

other attached packages:
 [1] quarto_1.4.4      factoextra_1.0.7  survival_3.8-3    broom_1.0.7      
 [5] ggridges_0.5.6    viridis_0.6.5     viridisLite_0.4.2 lubridate_1.9.4  
 [9] forcats_1.0.0     stringr_1.5.1     dplyr_1.1.4       purrr_1.0.4      
[13] readr_2.1.5       tidyr_1.3.1       tibble_3.2.1      ggplot2_3.5.1    
[17] tidyverse_2.0.0  

loaded via a namespace (and not attached):
 [1] gtable_0.3.6      xfun_0.50         htmlwidgets_1.6.4 processx_3.8.5   
 [5] rstatix_0.7.2     ggrepel_0.9.6     lattice_0.22-6    tzdb_0.4.0       
 [9] ps_1.8.1          vctrs_0.6.5       tools_4.4.2       generics_0.1.3   
[13] parallel_4.4.2    table1_1.4.3      pkgconfig_2.0.3   Matrix_1.7-2     
[17] lifecycle_1.0.4   compiler_4.4.2    farver_2.1.2      textshaping_1.0.0
[21] munsell_0.5.1     carData_3.0-5     htmltools_0.5.8.1 yaml_2.3.10      
[25] Formula_1.2-5     later_1.4.1       car_3.1-3         pillar_1.10.1    
[29] ggpubr_0.6.0      crayon_1.5.3      abind_1.4-8       tidyselect_1.2.1 
[33] digest_0.6.37     stringi_1.8.4     labeling_0.4.3    splines_4.4.2    
[37] fastmap_1.2.0     grid_4.4.2        colorspace_2.1-1  cli_3.6.4        
[41] magrittr_2.0.3    utf8_1.2.4        withr_3.0.2       scales_1.3.0     
[45] backports_1.5.0   bit64_4.6.0-1     timechange_0.3.0  rmarkdown_2.29   
[49] bit_4.5.0.1       gridExtra_2.3     ggsignif_0.6.4    ragg_1.3.3       
[53] hms_1.1.3         evaluate_1.0.3    knitr_1.49        rlang_1.1.5      
[57] Rcpp_1.0.14       glue_1.8.0        rstudioapi_0.17.1 vroom_1.6.5      
[61] jsonlite_2.0.0    R6_2.6.1          systemfonts_1.3.1